`%>%` <- magrittr::`%>%`
Definition of delta statistic
#' AUC-based two-sample statistic, shifted by 0.5.
#'
#' Pools `x` and `y` into one vector, tags every value with the sample it
#' came from ("X" or "Y"), hands both vectors to `rocauc::auc_by()` together
#' with the label "Y", and subtracts 0.5 from the result.
#' NOTE(review): presumably "Y" is treated as the positive class and 0.5 is
#' the chance-level AUC, so the statistic is centred on zero -- confirm
#' against the rocauc documentation.
#'
#' @param x Values of the first sample (labelled "X").
#' @param y Values of the second sample (labelled "Y").
#' @return The value of `rocauc::auc_by()` minus 0.5.
stat_auc <- function(x, y) {
  pooled <- c(x, y)
  # One label per pooled value: all "X" labels first, then all "Y" labels.
  origin <- rep(c("X", "Y"), times = c(length(x), length(y)))
  rocauc::auc_by(pooled, origin, "Y") - 0.5
}
#' Apply a two-sample statistic to one variable of two data sets.
#'
#' @param dx First data set; `var` is extracted from it with `[[`.
#' @param dy Second data set; `var` is extracted from it with `[[`.
#' @param var Name (or index) of the variable to extract from both data sets.
#' @param stat Function of two arguments, called as `stat(dx[[var]], dy[[var]])`.
#' @return Whatever `stat` returns.
apply_stat <- function(dx, dy, var, stat) {
  stat(dx[[var]], dy[[var]])
}
Plot of English statistics for segments attested with frequency >= 5
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Number of potential consonants by language
# Per-language column totals of `added_consonants` (defined outside this
# chunk). The bookkeeping columns `labels`, `freq` and `scores` are dropped
# and every remaining column is summed.
# NOTE(review): presumably each remaining column is a per-language 0/1
# indicator, so the sum is that language's number of candidate consonants --
# confirm against where `added_consonants` is built.
#
# `dplyr::select()` replaces the superseded `select_at(vars(...))`, and
# `colSums()` replaces `apply(2, sum)` on a data frame. `colSums()` always
# returns a double vector, which matches the <dbl> `nsegs` column in the
# printed output.
ncons_by_lang <- added_consonants %>%
  dplyr::select(-labels, -freq, -scores) %>%
  colSums() %>%
  # Turn the named vector of sums into a two-column tibble.
  (function(x) tibble::tibble(nsegs = x, language = names(x)))
Top N languages
# How many languages to list.
N_LANG <- 10

# Sort languages by descending `nsegs` and print the first N_LANG rows.
ncons_by_lang %>%
  dplyr::arrange(-nsegs) %>%
  head(N_LANG) %>%
  print()
## # A tibble: 10 x 2
## nsegs language
## <dbl> <chr>
## 1 39 uby
## 2 32 ady
## 3 29 gdo
## 4 28 kbd
## 5 28 lez
## 6 28 tkr
## 7 27 ven
## 8 26 nbl
## 9 25 mrt
## 10 24 ibi
## [[1]]
## stat_econ stat_loc stat_glob
## stat_econ 1.00000000 -0.01026796 -0.4808708
## stat_loc -0.01026796 1.00000000 0.2951242
## stat_glob -0.48087075 0.29512423 1.0000000
##
## [[2]]
## stat_econ stat_loc stat_glob
## stat_econ 1.00000000 0.06164304 -0.4973912
## stat_loc 0.06164304 1.00000000 0.3384304
## stat_glob -0.49739123 0.33843036 1.0000000
##
## [[3]]
## stat_econ stat_loc stat_glob
## stat_econ 1.00000000 -0.08309264 -0.3556031
## stat_loc -0.08309264 1.00000000 -0.1521580
## stat_glob -0.35560314 -0.15215796 1.0000000
##
## [[4]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.15086081 -0.44944164
## stat_loc 0.1508608 1.00000000 0.03661718
## stat_glob -0.4494416 0.03661718 1.00000000
##
## [[5]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.19540347 -0.41207608
## stat_loc 0.1954035 1.00000000 -0.05907608
## stat_glob -0.4120761 -0.05907608 1.00000000
##
## [[6]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.2964218 -0.0524047
## stat_loc 0.2964218 1.0000000 0.1260489
## stat_glob -0.0524047 0.1260489 1.0000000
##
## [[7]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.14095463 -0.28296890
## stat_loc 0.1409546 1.00000000 -0.05849594
## stat_glob -0.2829689 -0.05849594 1.00000000
##
## [[8]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.21024927 -0.27856790
## stat_loc 0.2102493 1.00000000 0.03457613
## stat_glob -0.2785679 0.03457613 1.00000000
##
## [[9]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 0.3340491 -0.5291890
## stat_loc 0.3340491 1.0000000 -0.1597613
## stat_glob -0.5291890 -0.1597613 1.0000000
##
## [[10]]
## stat_econ stat_loc stat_glob
## stat_econ 1.0000000 -0.1144513 -0.3864826
## stat_loc -0.1144513 1.0000000 0.3685129
## stat_glob -0.3864826 0.3685129 1.0000000
Merge the five “common” languages that would work OK (Hindi, Malayalam, Venda, Ndebele, and Kabardian)
# 3-D scatter (text labels plus markers) of the three statistics, keeping
# rows flagged for at least one of Hindi, Malayalam, Venda, Ndebele or
# Kabardian. `stats` is defined outside this chunk.
stats %>%
  dplyr::filter(hin == 1 | mal == 1 | ven == 1 | nbl == 1 | kbd == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq), # colour scale follows log frequency
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Hindi by itself
# 3-D scatter (text labels plus markers) of the three statistics, keeping
# only rows flagged for Hindi. `stats` is defined outside this chunk.
stats %>%
  dplyr::filter(hin == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq), # colour scale follows log frequency
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
# Dump the `labels` column of `stats` to labels.csv in the working directory.
write.csv(
  stats$labels,
  file = "labels.csv"
)
Kabardian by itself
# 3-D scatter (text labels plus markers) of the three statistics, keeping
# only rows flagged for Kabardian. `stats` is defined outside this chunk.
stats %>%
  dplyr::filter(kbd == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq), # colour scale follows log frequency
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Kabardian prime by itself
# 3-D scatter (text labels plus markers) of the three statistics, keeping
# only rows flagged for "Kabardian prime" (`kbd_prime`). `stats` is defined
# outside this chunk.
stats %>%
  dplyr::filter(kbd_prime == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq), # colour scale follows log frequency
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Kabardian prime and Hindi
# 3-D scatter (text labels plus markers) of the three statistics, keeping
# rows flagged for "Kabardian prime" (`kbd_prime`) or Hindi. `stats` is
# defined outside this chunk.
stats %>%
  dplyr::filter(kbd_prime == 1 | hin == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq), # colour scale follows log frequency
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Hindi and Kabardian
# 3-D scatter (text labels plus markers) of the three statistics, keeping
# rows flagged for Hindi or Kabardian. `stats` is defined outside this chunk.
stats %>%
  dplyr::filter(hin == 1 | kbd == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq), # colour scale follows log frequency
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Hindi and Malayalam
# 3-D scatter (text labels plus markers) of the three statistics, keeping
# rows flagged for Hindi or Malayalam. `stats` is defined outside this chunk.
stats %>%
  dplyr::filter(hin == 1 | mal == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq), # colour scale follows log frequency
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays
Hindi, Malayalam, and Georgian
# 3-D scatter (text labels plus markers) of the three statistics, keeping
# rows flagged for Hindi, Georgian (`kat`) or Malayalam. `stats` is defined
# outside this chunk.
stats %>%
  dplyr::filter(hin == 1 | kat == 1 | mal == 1) %>%
  plotly::plot_ly(
    x = ~stat_econ,
    y = ~stat_loc,
    z = ~stat_glob,
    text = ~labels,
    color = ~log(freq), # colour scale follows log frequency
    type = "scatter3d",
    mode = "text"
  ) %>%
  plotly::add_markers()
## Warning: textfont.color doesn't (yet) support data arrays
## Warning: textfont.color doesn't (yet) support data arrays